| Transit Score® | Description |
|---|---|
| 90–100 | Rider’s Paradise |
| World-class public transportation. | |
| 70–89 | Excellent Transit |
| Transit is convenient for most trips. | |
| 50–69 | Good Transit |
| Many nearby public transportation options. | |
| 25–49 | Some Transit |
| A few nearby public transportation options. | |
| 0–24 | Minimal Transit |
| It is possible to get on a bus. |
ggplot(clean.transit.data, aes(x = z.transit)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Standardised transit scores in America",
x = "Transit Scores",
y = "Frequency") +
bbplot::bbc_style()+
theme(plot.title = element_text(size = 12),
axis.title.x = element_text(size = 8), # Adjust the x-axis label size
axis.title.y = element_text(size = 8))
ggplot(transit.map) +
geom_sf(mapping = aes(fill = z.transit), colour = NA, inherit.aes = FALSE) +
scale_fill_gradient2(
low = "#2c7bb6", # Blue
mid = "#ffffbf", # Yellow (midpoint)
high = "#d7191c", # Red
midpoint = mean(transit.map$z.transit, na.rm = TRUE),
limits = c(min(transit.map$z.transit, na.rm = TRUE),
max(transit.map$z.transit, na.rm = TRUE)),
label = scales::comma,
na.value = "grey"
) +
labs(title = 'Standardised Transit Distribution') +
guides(fill = guide_legend(keyheight = 0.3)) +
theme_void() +
bbplot::bbc_style() +
theme(
plot.title = element_text(size = 16), # Make the title larger
legend.position = 'bottom',
legend.title = element_blank(),
legend.text = element_text(size = 10), # Make legend text smaller
axis.title = element_text(size = 10), # Make axis labels smaller
axis.text = element_text(size = 8) # Make axis text smaller
)
ggplot(transit.mobility.data, aes(x = date, y = rolling_avg)) +
geom_line(color = "blue") +
labs(title = "Changes in Visitors to Transit Over Time",
x = "Date",
y = "7-day Rolling Average of Mobility (%)") +
theme_minimal()
ggplot(workplace.mobility.data, aes(x = date, y = rolling_avg)) +
geom_line(color = "blue") +
labs(title = "Changes in Visitors to Workplaces Over Time",
x = "Date",
y = "7-day Rolling Average of Mobility (%)") +
theme_minimal()
# Add a type column to each dataframe
grocery.mobility.data <- grocery.mobility.data %>%
mutate(type = "Grocery and Pharmacy")
transit.mobility.data <- transit.mobility.data %>%
mutate(type = "Transit")
workplace.mobility.data <- workplace.mobility.data %>%
mutate(type = "Workplace")
# Combine the dataframes into one
combined_mobility_data <- bind_rows(grocery.mobility.data, transit.mobility.data, workplace.mobility.data)
# Create the ggplot
p <- ggplot(combined_mobility_data, aes(x = date, y = rolling_avg, color = type)) +
geom_line() +
labs(title = "Changes in Mobility Over Time",
x = "Date",
y = "7-day Rolling Average of Mobility (%)",
color = "Mobility Type") +
theme_minimal()
# Convert the ggplot to an interactive plotly plot
interactive_plot <- ggplotly(p)
# Display the interactive plot
interactive_plot
# Create the plot using ggplot
p <- ggplot(climate.state, aes(x = month, y = avg_temp, group = state, color = state)) +
geom_line() +
geom_point() +
labs(title = "Average Temperature Over Each Month Per State",
x = "Month",
y = "Average Temperature") +
theme_minimal()
# Convert the ggplot object to plotly
p <- ggplotly(p, tooltip = c("x", "y", "color"))
# Show the interactive plot
p
month_order <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
# Convert the month variable to a factor with the desired order
climate.map$month <- factor(climate.map$month, levels = month_order)
ggplot(climate.map) +
geom_sf(mapping = aes(fill = temp), colour = NA, inherit.aes = FALSE) +
scale_fill_gradient2(low = "#0072B2",
high = "red",
midpoint = mean(climate.map$temp, na.rm = TRUE),
limits = c(min(climate.map$temp, na.rm = TRUE),
max(climate.map$temp, na.rm = TRUE)),
label = scales::comma,
na.value = "grey") +
labs(title = 'Average Climate distribution') +
guides(fill = guide_legend(keyheight = 0.3)) +
theme_void() +
theme(plot.title = element_text(size = 7),
legend.position = 'bottom',
legend.title = element_blank()) +
facet_wrap(~month) # This line tells ggplot to create separate panels for each unique value in the month column. Each panel will display the map for one month.
The density per county, which measures the number of people living in a given area within a county, is an important factor in the spread of infectious diseases like COVID-19. Higher density means more people are concentrated in a smaller area, increasing the likelihood of person-to-person transmission through close contact. By analysing density per county, we can assess how different population concentrations impact the rate of new infections. This information can help identify areas that might need more stringent public health measures to control the spread of the virus.
As seen below area, with the highest areas tend to have higher number of case of covid.
ggplot(data = county_data_geo) +
geom_sf(aes(fill = log(Density.per.square.mile.of.land.area...Population)), color = NA) +
scale_fill_viridis_c(option = "viridis") +
theme_minimal() +
labs(title = "Population Density by County Standardised",
fill = "Density (per sq. mile)") +
geom_point(data = top_counties, aes(x = long, y = lat, size = new_cases_sum), color = "red", alpha = 0.7) +
scale_size_continuous(name = "Summed New Cases") +
theme(legend.position = "right", axis.title.x = element_blank(), # Remove x-axis title
axis.title.y = element_blank())
Additionally, the county areas below show that within the 20 most dense counties, there is an intersection counties with the highest case number
Density_Cross$Area_Name
## [1] "New York County" "Baltimore city"
Additionally, this extend beyond case numbers and is also true for deaths in counties.
Density_Cross_deaths$Area
## [1] "Kings County" "Queens County" "Cook County"
The age distribution of a population, particularly the proportion of individuals over 65, is essential in studying COVID-19 spread and impact. Older adults are more susceptible to severe illness and complications from COVID-19. They are also more likely to require hospitalisation and intensive care. By examining the share of the population over 65, we can assess the potential burden on healthcare systems and identify regions where protective measures and vaccination campaigns might need to be prioritised to protect this vulnerable group.
Seen below, a visual trend of areas with a higher proportion of 65+ indicates somewhat higher scores of covid cases. For example areas like Florida that historically have high numbers of retirees. However, the is no intersection between the county areas with the highest proportion of 65+ and covid cases.
ggplot(data = county_data_geo) +
geom_sf(aes(fill = log(Total_age85plusr/POP_ESTIMATE_2018)), color = NA) +
scale_fill_viridis_c(option = "viridis") +
theme_minimal() +
labs(title = "Proportion of Population Age 65+ by County Standardised",
fill = "Proportion") +
geom_point(data = top_counties, aes(x = long, y = lat, size = new_cases_sum), color = "red", alpha = 0.7) +
scale_size_continuous(name = "Summed New Cases") +
theme(legend.position = "right", axis.title.x = element_blank(), # Remove x-axis title
axis.title.y = element_blank())
The availability of healthcare resources, such as active physicians per 100,000 population, is a critical factor in managing and mitigating the effects of the COVID-19 pandemic. Areas with a higher number of active physicians can provide better medical care, testing, and treatment, which can help control the spread of the virus and reduce mortality rates. By analysing the distribution of active physicians, we can understand the capacity of different regions to respond to the pandemic and highlight areas that may need additional support and resources to effectively manage the healthcare demands caused by COVID-19.
However, the is no intersection between the county areas with the highest proportion of physicians and covid cases.
# Plot Active Physicians per 100,000
ggplot(data = county_data_geo) +
geom_sf(aes(fill = log(Total.Active.Patient.Care.Physicians.per.100000.Population.2018..AAMC.)), color = NA) +
scale_fill_viridis_c(option = "viridis") +
theme_minimal() +
labs(title = "Active Physicians per 100,000 by County Standardised",
fill = "Physicians/100k") +
geom_point(data = top_counties, aes(x = long, y = lat, size = new_cases_sum), color = "red", alpha = 0.7) +
scale_size_continuous(name = "Summed New Cases") +
theme(legend.position = "right", axis.title.x = element_blank(), # Remove x-axis title
axis.title.y = element_blank())
Active physicians, may be a confounding factor with population density,
as higher dense city-areas may attract more physicians. And as there is
low evidence above that increased physicians means less cases, it
demonstrates that density is a more telling factor of cases then
physicians.
This statement that population density is more significant to contributing to case numbers can be proved statistically.
county_cases_sum_df <- as.data.frame(county_cases_sum)
county.covid_df <- as.data.frame(county.covid)
county.covid_df <- na.omit(county.covid_df)
county.covid_df <- county.covid_df %>%
select(GEOID, pop.density, age.65.plus, active.patient.care.physicians)
model_df = left_join(county_cases_sum_df, county.covid_df, by = 'GEOID')
model_df <- na.omit(model_df)
model_df <- model_df %>%
distinct(GEOID, .keep_all = TRUE)
model_proption65 = lm(new_cases_sum ~ pop.density + age.65.plus + active.patient.care.physicians, model_df)
# Tidy the model output
summary(model_proption65)
##
## Call:
## lm(formula = new_cases_sum ~ pop.density + age.65.plus + active.patient.care.physicians,
## data = model_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27237 -4736 -1631 1343 733266
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.809e+04 5.732e+03 4.900 1.03e-06 ***
## pop.density 1.420e+00 4.452e-01 3.190 0.00144 **
## age.65.plus -1.598e+04 1.722e+04 -0.928 0.35356
## active.patient.care.physicians 2.214e+00 2.317e+01 0.096 0.92386
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 33860 on 2123 degrees of freedom
## Multiple R-squared: 0.005681, Adjusted R-squared: 0.004276
## F-statistic: 4.043 on 3 and 2123 DF, p-value: 0.007056
As seen above, compared to the other two variables, population density is most siginificant to indicating new cases.
Below is an interactive graph that demonstrates displays the top 100 counties and their new covid cases over 2020 - 2021. It can be identified visually the different Covid-waves/influxes in cases.
fig <- plot_ly(county.covid_long_top_df,
x = ~long,
y = ~lat,
frame = ~year_month,
ids = ~GEOID,
type = 'scattergeo',
mode = 'markers',
text = ~county.x, # Set the hover text to the county name
hoverinfo = 'text', # Only display the text (county name) in the hover info
marker = list(
size = ~log(new.cases), # Set the size of the markers based on the normalized new cases
color = 'red', # Optional: Set a constant color for the markers
opacity = 0.6,
line = list(width = 0)
),
name = 'COVID-19 Cases') %>%
layout(
title = "New COVID-19 Cases by County in 2021 for Highest Reporting Counties",
geo = list(
scope = 'usa', # Set the map scope to the USA
projection = list(type = 'albers usa'),
showland = TRUE,
landcolor = 'rgb(217, 217, 217)',
subunitwidth = 1,
countrywidth = 1,
subunitcolor = 'rgb(255, 255, 255)',
countrycolor = 'rgb(255, 255, 255)'
),
margin = list(l = 0, r = 0, t = 30, b = 0) # Adjust the margins if needed
) %>%
animation_opts(frame = 1000, easing = "elastic", redraw = TRUE) %>%
animation_button(x = 1, xanchor = "right", y = 0, yanchor = "bottom")
# Render the plotly object
fig
“county_state” “date” “new.cases”
[4] “cases_14days” “county” “state”
[7] “FIPS” “total.population” “pop.density”
[10] “age.65.plus” “avg.dec.temp” “avg.jan.temp”
[13] “avg.feb.temp” “avg.mar.temp” “avg.apr.temp”
[16] “avg.may.temp” “avg.jun.temp” “avg.jul.temp”
[19] “avg.aug.temp” “avg.sep.temp” “avg.oct.temp”
[22] “avg.nov.temp” “icu.beds” “active.physicians”
[25] “active.patient.care.physicians” “housing.density”
“transit.scores”
[28] “mobility.transit” “mobility.workplaces” “mobility.grocery”
[31] “new.cases.lag” “workplaces.lag14”
model2 <- lm(cases_14days ~ avg.jan.temp + avg.feb.temp + avg.mar.temp + avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp + avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp + avg.dec.temp , covid.data)
summary(model2)
##
## Call:
## lm(formula = cases_14days ~ avg.jan.temp + avg.feb.temp + avg.mar.temp +
## avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp +
## avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp +
## avg.dec.temp, data = covid.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -788 -56 -29 7 38704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -397.00403 4.06516 -97.66 <2e-16 ***
## avg.jan.temp 1.49125 0.11727 12.72 <2e-16 ***
## avg.feb.temp -7.73632 0.08672 -89.21 <2e-16 ***
## avg.mar.temp 6.24124 0.15992 39.03 <2e-16 ***
## avg.apr.temp 4.16294 0.17651 23.59 <2e-16 ***
## avg.may.temp -5.82414 0.11941 -48.77 <2e-16 ***
## avg.jun.temp -24.30396 0.18125 -134.09 <2e-16 ***
## avg.jul.temp 26.63539 0.18637 142.92 <2e-16 ***
## avg.aug.temp -8.48234 0.16586 -51.14 <2e-16 ***
## avg.sep.temp 2.30451 0.12589 18.31 <2e-16 ***
## avg.oct.temp 8.02431 0.11472 69.95 <2e-16 ***
## avg.nov.temp 9.33308 0.17684 52.78 <2e-16 ***
## avg.dec.temp -3.31985 0.12168 -27.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 240.6 on 2379966 degrees of freedom
## (10961 observations deleted due to missingness)
## Multiple R-squared: 0.03495, Adjusted R-squared: 0.03495
## F-statistic: 7183 on 12 and 2379966 DF, p-value: < 2.2e-16
model3 <- lm(cases_14days ~mobility.transit +mobility.grocery +mobility.workplaces + avg.jan.temp + avg.feb.temp + avg.mar.temp + avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp + avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp + avg.dec.temp, covid.data)
summary(model3)
##
## Call:
## lm(formula = cases_14days ~ mobility.transit + mobility.grocery +
## mobility.workplaces + avg.jan.temp + avg.feb.temp + avg.mar.temp +
## avg.apr.temp + avg.may.temp + avg.jun.temp + avg.jul.temp +
## avg.aug.temp + avg.sep.temp + avg.oct.temp + avg.nov.temp +
## avg.dec.temp, data = covid.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -799 -74 -30 18 38597
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -561.34293 6.63602 -84.59 <2e-16 ***
## mobility.transit -1.86760 0.01204 -155.09 <2e-16 ***
## mobility.grocery -0.67687 0.01799 -37.63 <2e-16 ***
## mobility.workplaces -0.91827 0.01813 -50.65 <2e-16 ***
## avg.jan.temp 4.81217 0.20528 23.44 <2e-16 ***
## avg.feb.temp -10.46979 0.15244 -68.68 <2e-16 ***
## avg.mar.temp 8.94395 0.26505 33.74 <2e-16 ***
## avg.apr.temp -3.75114 0.30867 -12.15 <2e-16 ***
## avg.may.temp -6.13216 0.19875 -30.85 <2e-16 ***
## avg.jun.temp -24.58477 0.29336 -83.81 <2e-16 ***
## avg.jul.temp 31.89003 0.32071 99.44 <2e-16 ***
## avg.aug.temp -13.60139 0.30398 -44.74 <2e-16 ***
## avg.sep.temp 10.87912 0.22105 49.22 <2e-16 ***
## avg.oct.temp 4.75981 0.19643 24.23 <2e-16 ***
## avg.nov.temp 10.11356 0.31310 32.30 <2e-16 ***
## avg.dec.temp -2.65084 0.21474 -12.34 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 300 on 1443118 degrees of freedom
## (947806 observations deleted due to missingness)
## Multiple R-squared: 0.06917, Adjusted R-squared: 0.06916
## F-statistic: 7149 on 15 and 1443118 DF, p-value: < 2.2e-16
library(lmtest)
# Durbin-Watson test
dwtest(model3)
##
## Durbin-Watson test
##
## data: model3
## DW = 0.016454, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
# Plot residuals vs fitted values
plot(fitted(model3), residuals(model3), main = "Residuals vs Fitted")
abline(h = 0, col = "red")
# Q-Q plot
qqnorm(residuals(model3))
qqline(residuals(model3), col = "red")